In [178]:
import json
import codecs
import math
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#from pylab import *
import seaborn as sns
from IPython.display import display, HTML
%matplotlib inline
#from mpltools import style
#from mpltools import layout
#style.use('ggplot')
# Load the proposals export. The file is decoded as UTF-8 explicitly (instead
# of whatever the platform default is) and the handle is closed as soon as
# parsing finishes; the original one-liner never closed it.
with codecs.open(os.path.join("data", "proposals.json"), "r", "utf-8") as proposals_file:
    proposals = json.load(proposals_file)
In [18]:
# One row per proposed session; preview the first two rows.
session_records = proposals['sessions']
sessions_df = pd.DataFrame(session_records)
sessions_df.head(2)
Out[18]:
In [19]:
# One row per theme; preview the first five rows.
theme_records = proposals['themes']
themes_df = pd.DataFrame(theme_records)
themes_df.head(5)
Out[19]:
In [8]:
# Headline figures, rendered as an HTML paragraph.
proposal_total = len(proposals['sessions'])
theme_total = len(proposals['themes'])
summary_html = "<p>This year, the Mozilla Festival received {0} proposals in {1} areas.</p>".format(proposal_total, theme_total)
display(HTML(summary_html))
In [58]:
# Horizontal bar chart of the proposal count per theme, smallest count first.
themes_by_count = themes_df.sort('totalProposals', ascending=True)
themes_by_count.plot(x='name', y='totalProposals',
                     kind='barh',
                     title='Number of proposals per Topic, #MozFest 2014')
Out[58]:
In [68]:
# Fold the "Open Knowledge/ School of Data" label into "Open Knowledge" (the
# replacement is applied to every column of the frame), then list the sessions
# that end up under that organization.
sessions_df = sessions_df.replace(to_replace="Open Knowledge/ School of Data",
                                  value="Open Knowledge")
is_open_knowledge = sessions_df['organization'] == "Open Knowledge"
sessions_df[is_open_knowledge]
Out[68]:
In [88]:
# Count the sessions proposed under each organization label, largest first.
sessions_gb = sessions_df.groupby('organization')
org_sizes = sessions_gb.aggregate({'organization': len})
org_count = org_sizes.sort('organization', ascending=False)
org_count.head()
Out[88]:
In [78]:
# Summarise how many organizations proposed sessions.
# Sessions without an organization are grouped under the empty-string key,
# which is not a real organization, so that row is dropped before counting.
# The original subtracted 1 from the total unconditionally and still counted
# the blank group among the "more than one session" organizations.
named_org_count = org_count[org_count.index != '']
multi_session_orgs = named_org_count[named_org_count['organization'] > 1]
display(HTML("<p>This year, {0} organizations proposed sessions, with {1} orgs proposing more than one session.</p>".format(len(named_org_count), len(multi_session_orgs))))
In [85]:
# Report how many sessions were grouped under the empty organization label,
# both as a count and as a share of all sessions.
blank_org_sessions = org_count.ix['']['organization']
blank_org_share = 100*blank_org_sessions/float(len(sessions_df))
display(HTML("<p>{0} sessions, or {1:.2f}% of all sessions had no organization listed. </p>".format(blank_org_sessions, blank_org_share)))
In [104]:
# Horizontal bar chart restricted to organization labels with more than two
# proposals, smallest count first.
frequent_orgs = org_count[org_count['organization'] > 2]
frequent_orgs_sorted = frequent_orgs.sort('organization', ascending=True)
frequent_orgs_sorted.plot(kind='barh',
                          figsize=(3, 6),
                          title='Proposals from orgs with >2 proposals, #MozFest 2014')
plt.legend('')  # presumably meant to blank out the automatic legend
plt.ylabel('')
plt.xlabel('Submissions')
Out[104]:
In [159]:
def _uniques(Series):
    """Return the number of distinct values found in ``Series``."""
    distinct_values = set(Series)
    return len(distinct_values)


def _avg_count(Series):
    """Return the mean of ``len(item)`` over the items of ``Series``."""
    item_lengths = Series.apply(len)
    return np.mean(item_lengths)


# Per-theme summary: distinct organization labels, number of sessions (one
# title per session row) and the average facilitator-list length.
sessions_gb_theme = sessions_df.groupby('themeSlug')
theme_aggregators = {'organization': _uniques, 'title': len, 'facilitators': _avg_count}
theme_count_df = sessions_gb_theme.aggregate(theme_aggregators)
theme_count_df
Out[159]:
In [176]:
# Scatter of sessions proposed (x) against proposing organizations (y), one
# point per theme. Marker colour and marker size are both derived from the
# theme's average facilitator count.
# For some reason the colormap isn't working
ax = theme_count_df.plot(x='title', y='organization',
                         kind='scatter',
                         colormap='autumn_r',
                         c=theme_count_df['facilitators'],
                         s=20*np.e**theme_count_df['facilitators'],
                         figsize=(8, 4))

# Label every point with its theme slug (the index of theme_count_df).
for i in theme_count_df.index:
    x = theme_count_df.ix[i]['title']
    y = theme_count_df.ix[i]['organization']
    plt.annotate(
        i,
        xy = (x, y), xytext = (-20, 20),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        arrowprops = dict(arrowstyle = '->',color='black', connectionstyle = 'arc3,rad=0'))

plt.xlabel('Number of Sessions Proposed per Theme')
plt.ylabel('Number of Proposing Organizations per Theme')
# The original called a bare `title(...)`, a name that only exists when
# `from pylab import *` is active; that import is commented out at the top of
# the notebook, so the call raised NameError on a fresh kernel.
plt.title('Number of Session Proposals and Proposing Organizations by Theme, #MozFest 2014\n (the color of dots and number on labels is the avg session facilitator count)')
Out[176]:
In [179]:
from byline_gender import BylineGender
bg = BylineGender()

# Index facilitators by name. Each entry collects the sessions the person is
# listed on, the distinct organizations of those sessions and the distinct
# Twitter handles seen for that name; "gender" starts out as None.
people = {}
for session in proposals['sessions']:
    org = session['organization']  # not necessarily membership, maybe collaboration
    for facilitator in session['facilitators']:
        name = facilitator['name']
        twitter = facilitator['twitter']
        # Fall back to the Twitter handle when no name was given.
        if len(name) == 0 and len(twitter) > 0:
            name = twitter
        # Test membership on the dict itself: `people.keys()` builds a list on
        # Python 2, which made every lookup a linear scan.
        if name not in people:
            people[name] = {"sessions": [], "orgs": [], "twitter": [], "gender": None}
        person = people[name]
        person['sessions'].append(session)
        if len(org) > 0 and org not in person['orgs']:
            person['orgs'].append(org)
        if len(twitter) > 0 and twitter not in person['twitter']:
            person['twitter'].append(twitter)
In [5]:
import unicodedata
import string
def get_org_name(org,name):
    """Return ``(org, asciiname)`` cleaned up for use as lookup keys.

    ``name`` is NFKD-normalised and every character that cannot be encoded as
    ASCII is dropped (so accented letters lose their accents); punctuation is
    then removed from both ``name`` and ``org``.

    Parameters:
        org:  organization label (text).
        name: person name (text / unicode).

    Returns:
        A 2-tuple ``(org, asciiname)`` of text strings.
    """
    # Decode back to text after the ASCII round-trip. Without it the value is
    # a bytes object on Python 3, iterating it yields ints, and the join below
    # raises TypeError.
    asciiname = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    asciiname = ''.join(ch for ch in asciiname if ch not in string.punctuation)
    org = ''.join(ch for ch in org if ch not in string.punctuation)
    return (org, asciiname)
# NOTE(review): this cell repeats the import, the BylineGender instance and
# the people-building loop of the earlier `people` cell line for line.
from byline_gender import BylineGender
bg = BylineGender()

# Index facilitators by name: sessions they are listed on, distinct session
# organizations and distinct Twitter handles; "gender" starts out as None.
people = {}
for session in proposals['sessions']:
    org = session['organization']  # not necessarily membership, maybe collaboration
    for facilitator in session['facilitators']:
        name = facilitator['name']
        twitter = facilitator['twitter']
        # Fall back to the Twitter handle when no name was given.
        if len(name) == 0 and len(twitter) > 0:
            name = twitter
        # Dict membership instead of `people.keys()`, which is a freshly built
        # list (and a linear scan) on Python 2.
        if name not in people:
            people[name] = {"sessions": [], "orgs": [], "twitter": [], "gender": None}
        person = people[name]
        person['sessions'].append(session)
        if len(org) > 0 and org not in person['orgs']:
            person['orgs'].append(org)
        if len(twitter) > 0 and twitter not in person['twitter']:
            person['twitter'].append(twitter)
#ONE TIME ONLY: Generate Name CSV to import to Google Spreadsheets
#TODO: SAVE ACTUAL ASCII NAMES AND ORG NAMES
#orgnames = {}
#for name in people.keys():
# if len(name)==0:
# continue
# person = people[name]
# if(len(person['orgs'])>0):
# org = person['orgs'][0]
# elif(len(person['twitter'])>0):
# org = person['twitter'][0]
# #alas, the python version of Open Gender Tracker is not unicode safe :p
# #asciiname = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
# #asciiname = ''.join(ch for ch in asciiname if ch not in string.punctuation)
# #org = ''.join(ch for ch in org if ch not in string.punctuation)
# org,asciiname = get_org_name(org,name)
# if org not in orgnames.keys():
# orgnames[org]={}
# if asciiname not in orgnames[org].keys():
# orgnames[org][asciiname] = len(person['sessions'])
#f = codecs.open("mozfest_org_names.csv","w", "utf-8")
#bg.export_org_names(orgnames,f)
#f.close()
In [35]:
#GENERATE A DATASET OF GENDER PER THEME
# theme_people maps a theme slug to:
#   "facilitators": names already counted for that theme
#   "inclusive":    gender tallies counting a person once per session listing
#   "unique":       gender tallies counting each name once per theme
theme_people = {}
for theme in proposals['themes']:
    slug = theme['slug']
    #filter by sessions that have the current theme
    sessions = [x for x in proposals['sessions'] if 'themeSlug' in x and x['themeSlug'] == slug]
    #initialize hash
    if slug not in theme_people:
        theme_people[slug] = {"facilitators": [],
                              "inclusive": {"female": 0, "male": 0, "unknown": 0, "total": 0},
                              "unique": {"female": 0, "male": 0, "unknown": 0, "total": 0},
                              }
    theme_entry = theme_people[slug]
    for session in sessions:
        session_org = session['organization']
        for person in session['facilitators']:
            # Look up by the session's organization, falling back to this
            # person's own Twitter handle when the session has none. The
            # fallback is resolved per person: the original stored it in the
            # variable shared by the whole session, so the first handle found
            # was reused for every later facilitator of that session.
            org = session_org
            if len(org) == 0 and len(person['twitter']) > 0:
                org = person['twitter']
            name = person['name']
            asciiname = ''.join(ch for ch in name if ch not in string.punctuation)
            org = ''.join(ch for ch in org if ch not in string.punctuation)
            # The result is used as a tally key below, so it is expected to be
            # "female", "male", "unknown" or the skip marker "ignore".
            inferred_gender = bg.org_name_gender(org, asciiname)
            if inferred_gender != "ignore":
                theme_entry['inclusive'][inferred_gender] += 1
                theme_entry['inclusive']['total'] += 1
                if name not in theme_entry['facilitators']:
                    theme_entry['unique'][inferred_gender] += 1
                    theme_entry['unique']['total'] += 1
                    theme_entry['facilitators'].append(name)
#generate gender specific series
def pct(a,b):
    """Return ``a`` as a percentage of ``b``, as a float.

    Returns 0.0 when ``b`` is zero (for example a theme with no counted
    facilitators) instead of raising ZeroDivisionError.
    """
    if b == 0:
        return 0.0
    return 100.*(float(a)/float(b))
# Order the themes from most to fewest inferred facilitators (inclusive
# count). The original used the whole 'inclusive' tally dict as the sort key:
# dicts cannot be ordered on Python 3 (TypeError), and on Python 2 the
# resulting order is not a ranking by size.
themes = sorted(theme_people.keys(),
                key=lambda slug: theme_people[slug]['inclusive']['total'],
                reverse=True)

# Parallel per-theme series, in `themes` order, for the stacked bar charts.
# `unknown_bottom` is female + male, i.e. the height at which the "unknown"
# segment starts.
female = {"unique": [], "inclusive": [], 'unique_pct': [], 'inclusive_pct': []}
male = {"unique": [], "inclusive": [], 'unique_pct': [], 'inclusive_pct': []}
unknown = {"unique": [], "inclusive": [], 'unique_pct': [], 'inclusive_pct': []}
unknown_bottom = {'unique': [], 'inclusive': [], 'unique_pct': [], 'inclusive_pct': []}
for theme in themes:
    for k in ['unique', 'inclusive']:
        counts = theme_people[theme][k]
        total = counts['total']
        # absolute counts
        female[k].append(counts['female'])
        male[k].append(counts['male'])
        unknown[k].append(counts['unknown'])
        unknown_bottom[k].append(counts['female'] + counts['male'])
        # the same figures as a percentage of the theme's total
        female[k+"_pct"].append(pct(counts['female'], total))
        male[k+"_pct"].append(pct(counts['male'], total))
        unknown[k+"_pct"].append(pct(counts['unknown'], total))
        unknown_bottom[k+'_pct'].append(female[k+"_pct"][-1] + male[k+"_pct"][-1])
In [7]:
# Total of the per-theme unique-facilitator lists (a name that appears under
# several themes is counted once for each of them).
sum(len(theme_people[slug]['facilitators']) for slug in theme_people)
Out[7]:
In [8]:
# Four stacked bar charts (female / male / unknown per theme):
# {unique, inclusive} x {absolute counts, percentages}.
ind = np.arange(len(themes))
width = 0.4       # the width of the bars: can also be len(x) sequence
for j in ['unique', 'inclusive']:
    for a in ['', '_pct']:
        k = j + a
        fig = plt.figure(figsize=(14, 8))
        # A single axes per figure (the original requested the same subplot
        # twice).
        ax = fig.add_subplot(111)
        p1 = ax.bar(ind, female[k], width, color='#48C8B8')
        p2 = ax.bar(ind, male[k], width, color='#E8CA33',
                    bottom=female[k])
        p3 = ax.bar(ind, unknown[k], width, color='#cccccc',
                    bottom=unknown_bottom[k])
        ax.set_xticks(ind + width/2.)
        if a == "_pct":
            ax.set_yticks(np.arange(0, 101, 10))
            # positional (bottom, top) form
            ax.set_ylim(0, 100)
            # The percentage charts were labelled "Number of ..." as well.
            ax.set_ylabel("Percentage of {0} session facilitators".format(j))
        else:
            ax.set_ylabel("Number of {0} session facilitators".format(j))
        ax.set_xticklabels(themes, rotation=45, fontsize=18, ha='center')
        ax.set_title("Inferred Sex of #MozFest 2014 proposed session facilitators ({0})\n".format(j), fontsize=16)
        ax.legend((p1[0], p2[0], p3[0]), ('Female', 'Male', "Unknown"), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()
In [ ]:
# ADD GENDER TO SESSION HASH
# Every session dict gains a u'gender' tally of its facilitators' inferred
# genders; facilitators whose inferred gender is "ignore" are not counted.
# NOTE(review): unlike the per-theme cell, no Twitter-handle fallback is used
# for the organization here - confirm that the difference is intended.
for session in proposals['sessions']:
    tally = {u'female':0,u'male':0,u'unknown':0,u'total':0}
    session[u'gender'] = tally
    for person in session['facilitators']:
        name = person['name']
        org = session['organization']
        asciiname = ''.join(ch for ch in name if ch not in string.punctuation)
        org = ''.join(ch for ch in org if ch not in string.punctuation)
        inferred_gender = bg.org_name_gender(org, asciiname)
        if inferred_gender == "ignore":
            continue
        tally[inferred_gender] += 1
        tally[u'total'] += 1
In [37]:
# Write one CSV row per session that has both a theme and an organization
# key. Punctuation (commas included) is stripped from the organization and
# title fields, so the rows can be joined with plain commas.
# Each session must already carry the 'gender' tally added by the
# "ADD GENDER TO SESSION HASH" cell.
# The `with` block closes the file even when a row raises; the original only
# closed it after a fully successful loop.
with codecs.open("mozilla_session_gender.csv", "w", "utf8") as f:
    f.write(','.join(["slug", "org", "title", "female", "male", "unknown", "total"]) + "\n")
    for session in proposals['sessions']:
        if 'themeSlug' in session and 'organization' in session:
            org = ''.join(ch for ch in session['organization'] if ch not in string.punctuation)
            title = ''.join(ch for ch in session['title'] if ch not in string.punctuation)
            gender = session['gender']
            f.write(','.join([session['themeSlug'], org, title,
                              str(gender[u'female']), str(gender[u'male']),
                              str(gender[u'unknown']), str(gender[u'total'])]) + "\n")
In [ ]: